Data Preparation
setwd("~/STAT 482")
library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)
library(stringr)
library(openxlsx)
library(patchwork)
library(purrr)
library(GGally)
library(plotly)
# turn off scientific notation format output
options(scipen = 999)
# The clean_char function takes in a character and returns a number without the range in brackets and without spaces in the mean.
# For example, "193 232 [149 000-245 000]" becomes 193232
clean_char = function(text){
return(as.numeric(gsub(" ", "", sub("\\[.*", "", text))))
}
Population per Country
# Import Subsaharan African countries list
countries = read_excel("~/STAT 482/subsaharan_countries.xlsx", sheet = "Countries", col_names = TRUE)
population = read_excel("~/STAT 482/Population.xls", sheet = "Data", col_names = TRUE)
# filter to get only sub-Saharan African countries
population = population[population$`Country Name` %in% countries$country_name,]
# keep only years 2000-2020, remove Indicator Name and Indicator Code cols.
to_remove = as.character(c(1960:1999,2021:2023, "Indicator Code", "Indicator Name"))
population = population[,!(colnames(population) %in% to_remove)]
Forest Cover Data
# import forest cover data by country and the sub-Saharan countries list
forest = read.csv("forest_coverage_data_by_country.xls - Data.csv")
# take out the first two rows of forest, which contain data we don't need
forest = forest[-c(1,2),]
# make the first row of the data the headers
colnames(forest) = as.character(unlist(forest[1,]))
forest = forest[-1,]
# filter to get only sub-Saharan African countries
forest = forest[forest$`Country Name` %in% countries$country_name,]
# keep only years 2000-2020, remove Indicator Name and Indicator Code cols.
forest = forest[,!(colnames(forest) %in% to_remove)]
Malaria Case Data
# import malaria case number data by country
malaria = read_xlsx("estimated_malaria_cases.xlsx")
# filter to get only Sub-saharan African countries
malaria = malaria[malaria$`Country Name` %in% countries$country_name,]
# loop through all numeric columns in malaria, applying clean_char function
col_mal = colnames(malaria) # list of column names of malaria data
for (i in 2:length(col_mal)){
malaria[col_mal[i]] = sapply(malaria[,i], clean_char)}
# remove column for year 2021
malaria = malaria[,!(colnames(malaria) %in% to_remove)]
# create table with country name and country code for joining
country_name_code = forest[,c(1:2)]
# join the country code to the malaria dataset by the country name
malaria = left_join(malaria, country_name_code, by = "Country Name")
Normalize Malaria Case Data by Population
# Reshape to long format
malaria_long = malaria %>%
pivot_longer(cols = starts_with("20"), names_to = "year", values_to = "cases")
population_long = population %>%
pivot_longer(cols = starts_with("20"), names_to = "year", values_to = "population")
# Join the data
combined_data = malaria_long %>%
left_join(population_long, by = c("Country Name", "Country Code", "year"))
# Normalize the data
normalized_data = combined_data %>%
mutate(normalized_cases = cases / population)
# Reshape back to wide format
malaria = normalized_data %>%
select("Country Name", "Country Code", year, normalized_cases) %>%
pivot_wider(names_from = year, values_from = normalized_cases)
Rainfall Data
rainfall = read_excel("~/STAT 482/rainfall_data.xls", sheet = "Data", col_names = TRUE)
rainfall = rainfall[rainfall$`Country Name` %in% countries$country_name,]
rainfall = rainfall[,!(colnames(rainfall) %in% to_remove)]
GDP per capita data
gdp = read_excel("~/STAT 482/GDP_per_capita.xls", sheet = "Data", col_names = TRUE)
gdp = gdp[gdp$`Country Name` %in% countries$country_name,]
gdp = gdp[,!(colnames(gdp) %in% to_remove)]
Urbanization Rate Data
urbanization = read_excel("~/STAT 482/urbanization_rate_data.xls", sheet = "Data", col_names = TRUE)
urbanization = urbanization[urbanization$`Country Name` %in% countries$country_name,]
urbanization = urbanization[,!(colnames(urbanization) %in% to_remove)]
Combine Datasets
# reorder the 'country' variable in order of most north to most south in latitude
countries_ordered = c("Djibouti", "Eritrea", "Sudan", "Chad", "Niger", "Mali", "Mauritania", "Senegal", "Gambia, The", "Guinea-Bissau", "Guinea", "Sierra Leone", "Liberia", "Burkina Faso", "Ghana", "Togo", "Benin", "Nigeria", "Cameroon", "Central African Republic", "South Sudan", "Ethiopia", "Somalia", "Equatorial Guinea", "Gabon", "Congo, Rep.", "Congo, Dem. Rep.", "Uganda", "Rwanda", "Burundi", "Kenya", "Tanzania", "Sao Tome and Principe", "Angola", "Zambia", "Malawi", "Mozambique", "Zimbabwe", "Namibia", "Botswana", "Eswatini", "South Africa", "Madagascar")
# assign the ordered country names to malaria and forest dataset
malaria$`Country Name` = factor(malaria$`Country Name`, levels = countries_ordered)
forest$`Country Name` = factor(forest$`Country Name`, levels = countries_ordered)
# reshape datasets to long format
malaria_long = malaria %>%
pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "Malaria_Incidence")
forest_long = forest %>%
pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "Forest_Cover")
rainfall_long = rainfall %>%
pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "Rainfall_Depth")
gdp_long = gdp %>%
pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "GDP_per_Capita")
urban_long = urbanization %>%
pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "Urbanization_Perc")
# Merge datasets
combined_data = malaria_long %>%
left_join(forest_long, by = c("Country Name", "Year")) %>%
left_join(rainfall_long, by = c("Country Name", "Year")) %>%
left_join(gdp_long, by = c("Country Name", "Year")) %>%
left_join(urban_long, by = c("Country Name", "Year"))
combine_remove = as.character(c("Country Code.x", "Country Code.x.x", "Country Code.y.y", "Country Code.y"))
combined_data = combined_data[,!(colnames(combined_data) %in% combine_remove)]
# View combined dataset
View(combined_data)
write.xlsx(combined_data, "combined_data.xlsx")
Exploratory Data Analysis
Scatter plots of Malaria Incidence and Forest Cover Percentage by
Country
# make the year numeric for plotting
combined_data$Year = as.numeric(combined_data$Year)
# forest scatterplot of Year vs Forest Cover Percentage by Country
ggplot(data = combined_data, aes(x = Year,
y = Forest_Cover,
color = `Country Name`)) +
geom_point(size = 3) +
labs(title = "Year vs Forest Cover Percentage",
x = "Year",
y = "Forest Cover Percentage") +
theme(
legend.text = element_text(size = 10),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
plot.title = element_text(size = 20),
axis.text = element_text(size = 9)
) +
guides(color = guide_legend(ncol = 1, title = NULL))

# malaria scatterplot of Year vs Malaria Incidence by Country
ggplot(data = combined_data, aes(x = Year,
y = Malaria_Incidence,
color = `Country Name`)) +
geom_point(size = 3) +
labs(title = "Year vs Malaria Incidence",
x = "Year",
y = "Malaria Incidence") +
theme(
legend.text = element_text(size = 10),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
plot.title = element_text(size = 20),
axis.text = element_text(size = 9)
) +
guides(color = guide_legend(ncol = 1, title = NULL))

Summary Data of Malaria Incidence and Forest Cover Percentage by
Country
# summarize malaria incidence data by country - rows are sorted by mean incidence
View(combined_data %>%
group_by(`Country Name`) %>%
summarize(
Min_Incidence = min(Malaria_Incidence),
Max_Incidence = max(Malaria_Incidence),
Mean_Incidence = mean(Malaria_Incidence),
Median_Incidence = median(Malaria_Incidence),
SD_Incidence = sd(Malaria_Incidence)) %>%
arrange(Mean_Incidence)
)
# summarize forest cover % by country - rows are sorted by mean forest cover %
View(combined_data %>%
group_by(`Country Name`) %>%
summarize(
Min_Perc_Cover = min(Forest_Cover),
Max_Perc_Cover = max(Forest_Cover),
Mean_Perc_Cover = mean(Forest_Cover),
Median_Perc_Cover = median(Forest_Cover),
SD_Perc_Cover = sd(Forest_Cover)) %>%
arrange(Mean_Perc_Cover))
Histograms of Malaria Incidence and Forest Cover Percentage by
Country
# plot the histograms of forest cover for each country
ggplot(data = combined_data, aes(x = Forest_Cover)) +
geom_histogram(fill = "forest green", color = "black") +
labs(title = "Histogram of Forest Cover Percentage by Country", x = "Forest Cover Percentage", y = "Frequency") +
theme(
plot.title = element_text(size = 20),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text = element_text(size = 9)
) +
facet_wrap(~ `Country Name`, ncol = 3, scales = "free_x")

# plot the histograms of malaria incidence for each country
ggplot(data = combined_data, aes(x = Malaria_Incidence)) +
geom_histogram(fill = "forest green", color = "black") +
labs(title = "Histogram of Malaria Incidence by Country", x = "Malaria Incidence", y = "Frequency") +
theme(
plot.title = element_text(size = 20),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text = element_text(size = 9)
) +
facet_wrap(~ `Country Name`, ncol = 3, scales = "free_x")

Box plots of Malaria Incidence and Forest Cover Percentage by
Country (one plot)
# boxplot of malaria incidence by country - all in one plot
ggplot(data = combined_data, aes(x = `Country Name`, y = Malaria_Incidence)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.colour = "red", outlier.shape = 16) +
labs(title = "Boxplot of Malaria Incidence by Country", x = "Country", y = "Malaria Incidence") +
theme_minimal() +
theme(
plot.title = element_text(size = 20),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1))

# boxplot of forest cover percentage by country - all in one plot
ggplot(data = combined_data, aes(x = `Country Name`, y = Forest_Cover)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.colour = "red", outlier.shape = 16) +
labs(title = "Boxplot of Forest Cover Percentage by Country", x = "Country", y = "Forest Cover Percentage") +
theme_minimal() +
theme(
plot.title = element_text(size = 20),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1))

Box plots of Malaria Incidence and Forest Cover Percentage by
Country (separate plots)
# boxplot of malaria incidence by country - separate plots
ggplot(data = combined_data, aes(x = Forest_Cover)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.color = "red") +
labs(title = "Boxplot of Forest Cover Percentage by Country", x = "", y = "") +
theme(
plot.title = element_text(size = 20),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text.y = element_blank()
) +
facet_wrap(~ `Country Name`, ncol = 3, scales = "free_x")

# boxplot of malaria incidence by country - separate plots
ggplot(data = combined_data, aes(x = Malaria_Incidence)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.color = "red") +
labs(title = "Boxplot of Malaria Incidence by Country", x = "", y = "") +
theme(
plot.title = element_text(size = 20),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text.y = element_blank()
) +
facet_wrap(~ `Country Name`, ncol = 3, scales = "free_x")

Heat map of Malaria Incidence and Forest Cover Percentage by
Country
# format forest cover data for heatmap compatability
forest_long = melt(combined_data, id.vars = c("Year", "Country Name"),
measure.vars = "Forest_Cover")
ggplot(forest_long, aes(x = Year, y = `Country Name`, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "forest green") +
labs(title = "Heatmap of Forest Cover Percentage by Year and Country",
x = "Year",
y = "Country",
fill = "Forest Cover Percentage") +
theme_minimal() +
theme(
plot.title = element_text(size = 20),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text = element_text(size = 12))

# format malaria data for heatmap compatability
malaria_long = melt(combined_data, id.vars = c("Year", "Country Name"),
measure.vars = "Malaria_Incidence")
ggplot(malaria_long, aes(x = Year, y = `Country Name`, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "red") +
labs(title = "Heatmap of Malaria Incidence by Year and Country",
x = "Year",
y = "Country",
fill = "Malaria Incidence") +
theme_minimal() +
theme(
plot.title = element_text(size = 20),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text = element_text(size = 12))
